% ----------------SURVEY PAPERS---------------------
@article{yang-etal-2024-diffusion,
  author       = {Ling Yang and
                  Zhilong Zhang and
                  Yang Song and
                  Shenda Hong and
                  Runsheng Xu and
                  Yue Zhao and
                  Wentao Zhang and
                  Bin Cui and
                  Ming{-}Hsuan Yang},
  title        = {{Diffusion Models: {A} Comprehensive Survey of Methods and Applications}},
  journal      = {{ACM} Comput. Surv.},
  volume       = {56},
  number       = {4},
  pages        = {105:1--105:39},
  year         = {2024},
}

@article{croitoru-etal-2023-diffusion,
  author       = {Florinel{-}Alin Croitoru and
                  Vlad Hondru and
                  Radu Tudor Ionescu and
                  Mubarak Shah},
  title        = {{Diffusion Models in Vision: {A} Survey}},
  journal      = {TPAMI},
  volume       = {45},
  number       = {9},
  pages        = {10850--10869},
  year         = {2023},
}

@article{zhang-etal-2023-texttoimage,
  author       = {Chenshuang Zhang and
                  Chaoning Zhang and
                  Mengchun Zhang and
                  In So Kweon},
  title        = {{Text-to-image Diffusion Models in Generative {AI:} {A} Survey}},
  journal      = {CoRR},
  volume       = {abs/2303.07909},
  year         = {2023},
}

@misc{po-etal-2023-state,
  author        = {Ryan Po and
                   Wang Yifan and
                   Vladislav Golyanik and
                   Kfir Aberman and
                   Jonathan T. Barron and
                   Amit H. Bermano and
                   Eric Ryan Chan and
                   Tali Dekel and
                   Aleksander Holynski and
                   Angjoo Kanazawa and
                   C. Karen Liu and
                   Lingjie Liu and
                   Ben Mildenhall and
                   Matthias Nie{\ss}ner and
                   Bj{\"{o}}rn Ommer and
                   Christian Theobalt and
                   Peter Wonka and
                   Gordon Wetzstein},
  title         = {{State of the Art on Diffusion Models for Visual Computing}},
  year          = {2023},
  eprint        = {2310.07204},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
}

% NOTE: the citation key below misspells "efficient"; it is kept unchanged so
% that existing \cite commands using this key keep resolving.
@article{ulhaq-etal-2022-effiicent,
  author       = {Anwaar Ulhaq and
                  Naveed Akhtar and
                  Ganna Pogrebna},
  title        = {{Efficient Diffusion Models for Vision: {A} Survey}},
  journal      = {CoRR},
  volume       = {abs/2210.09292},
  year         = {2022},
}

@misc{cao-etal-2024-controllable,
  author        = {Pu Cao and
                   Feng Zhou and
                   Qing Song and
                   Lu Yang},
  title         = {{Controllable Generation with Text-to-Image Diffusion Models: A Survey}},
  year          = {2024},
  eprint        = {2403.04279},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}


% ----------------TEXT-TO-IMAGE GENERATION---------------------
@inproceedings{baraheem-etal-2020-text,
  author       = {Samah Saeed Baraheem and
                  Trung{-}Nghia Le and
                  Tam V. Nguyen},
  title        = {{Text-to-Image Synthesis via Aesthetic Layout}},
  booktitle    = {ACM MM},
  pages        = {4485--4487},
  year         = {2020},
}

@inproceedings{ding-etal-2021-cogview,
  author       = {Ming Ding and
                  Zhuoyi Yang and
                  Wenyi Hong and
                  Wendi Zheng and
                  Chang Zhou and
                  Da Yin and
                  Junyang Lin and
                  Xu Zou and
                  Zhou Shao and
                  Hongxia Yang and
                  Jie Tang},
  title        = {{CogView: Mastering Text-to-Image Generation via Transformers}},
  booktitle    = {NeurIPS},
  pages        = {19822--19835},
  year         = {2021},
}

@inproceedings{ramesh-etal-2021-dalle1,
  author       = {Aditya Ramesh and
                  Mikhail Pavlov and
                  Gabriel Goh and
                  Scott Gray and
                  Chelsea Voss and
                  Alec Radford and
                  Mark Chen and
                  Ilya Sutskever},
  title        = {{Zero-Shot Text-to-Image Generation}},
  booktitle    = {ICML},
  series       = {Proceedings of Machine Learning Research},
  volume       = {139},
  pages        = {8821--8831},
  year         = {2021},
}

@inproceedings{ruan-etal-2021-daegan,
  author       = {Shulan Ruan and
                  Yong Zhang and
                  Kun Zhang and
                  Yanbo Fan and
                  Fan Tang and
                  Qi Liu and
                  Enhong Chen},
  title        = {{{DAE-GAN:} Dynamic Aspect-aware {GAN} for Text-to-Image Synthesis}},
  booktitle    = {ICCV},
  pages        = {13940--13949},
  year         = {2021},
}

@inproceedings{wang-etal-2021-cycle,
  author       = {Hao Wang and
                  Guosheng Lin and
                  Steven C. H. Hoi and
                  Chunyan Miao},
  title        = {{Cycle-Consistent Inverse {GAN} for Text-to-Image Synthesis}},
  booktitle    = {ACM MM},
  pages        = {630--638},
  year         = {2021},
}

@inproceedings{qiao-etal-2021-rgan,
  author       = {Yanyuan Qiao and
                  Qi Chen and
                  Chaorui Deng and
                  Ning Ding and
                  Yuankai Qi and
                  Mingkui Tan and
                  Xincheng Ren and
                  Qi Wu},
  title        = {{{R-GAN:} Exploring Human-like Way for Reasonable Text-to-Image Synthesis
                  via Generative Adversarial Networks}},
  booktitle    = {ACM MM},
  pages        = {2085--2093},
  year         = {2021},
}

@inproceedings{zhang-etal-2021-ufcbert,
  author       = {Zhu Zhang and
                  Jianxin Ma and
                  Chang Zhou and
                  Rui Men and
                  Zhikang Li and
                  Ming Ding and
                  Jie Tang and
                  Jingren Zhou and
                  Hongxia Yang},
  title        = {{{UFC-BERT:} Unifying Multi-Modal Controls for Conditional Image Synthesis}},
  booktitle    = {NeurIPS},
  pages        = {27196--27208},
  year         = {2021},
}

@inproceedings{rombach-etal-2022-stable-diffusion,
  author       = {Robin Rombach and
                  Andreas Blattmann and
                  Dominik Lorenz and
                  Patrick Esser and
                  Bj{\"{o}}rn Ommer},
  title        = {{High-Resolution Image Synthesis with Latent Diffusion Models}},
  booktitle    = {CVPR},
  pages        = {10674--10685},
  year         = {2022},
}

@inproceedings{gu-etal-2022-vector,
  author       = {Shuyang Gu and
                  Dong Chen and
                  Jianmin Bao and
                  Fang Wen and
                  Bo Zhang and
                  Dongdong Chen and
                  Lu Yuan and
                  Baining Guo},
  title        = {{Vector Quantized Diffusion Model for Text-to-Image Synthesis}},
  booktitle    = {CVPR},
  pages        = {10686--10696},
  year         = {2022},
}

@inproceedings{tao-etal-2022-dfgan,
  author       = {Ming Tao and
                  Hao Tang and
                  Fei Wu and
                  Xiaoyuan Jing and
                  Bing{-}Kun Bao and
                  Changsheng Xu},
  title        = {{{DF-GAN:} {A} Simple and Effective Baseline for Text-to-Image Synthesis}},
  booktitle    = {CVPR},
  pages        = {16494--16504},
  year         = {2022},
}

@inproceedings{zhou-etal-2022-lafite,
  author       = {Yufan Zhou and
                  Ruiyi Zhang and
                  Changyou Chen and
                  Chunyuan Li and
                  Chris Tensmeyer and
                  Tong Yu and
                  Jiuxiang Gu and
                  Jinhui Xu and
                  Tong Sun},
  title        = {{Towards Language-Free Training for Text-to-Image Generation}},
  booktitle    = {CVPR},
  pages        = {17886--17896},
  year         = {2022},
}

@inproceedings{wu-etal-2022-text,
  author       = {Fuxiang Wu and
                  Liu Liu and
                  Fusheng Hao and
                  Fengxiang He and
                  Jun Cheng},
  title        = {{Text-to-Image Synthesis based on Object-Guided Joint-Decoding Transformer}},
  booktitle    = {CVPR},
  pages        = {18092--18101},
  year         = {2022},
}

@inproceedings{li-etal-2022-stylet2i,
  author       = {Zhiheng Li and
                  Martin Renqiang Min and
                  Kai Li and
                  Chenliang Xu},
  title        = {{StyleT2I: Toward Compositional and High-Fidelity Text-to-Image Synthesis}},
  booktitle    = {CVPR},
  pages        = {18176--18186},
  year         = {2022},
}

@inproceedings{kim-etal-2022-diffusionclip,
  author       = {Gwanghyun Kim and
                  Taesung Kwon and
                  Jong Chul Ye},
  title        = {{DiffusionCLIP: Text-Guided Diffusion Models for Robust Image Manipulation}},
  booktitle    = {CVPR},
  pages        = {2416--2425},
  year         = {2022},
}

@inproceedings{ding-etal-2022-cogview2,
  author       = {Ming Ding and
                  Wendi Zheng and
                  Wenyi Hong and
                  Jie Tang},
  title        = {{CogView2: Faster and Better Text-to-Image Generation via Hierarchical
                  Transformers}},
  booktitle    = {NeurIPS},
  year         = {2022},
}

@inproceedings{saharia-etal-2022-imagen,
  author       = {Chitwan Saharia and
                  William Chan and
                  Saurabh Saxena and
                  Lala Li and
                  Jay Whang and
                  Emily L. Denton and
                  Seyed Kamyar Seyed Ghasemipour and
                  Raphael Gontijo Lopes and
                  Burcu Karagol Ayan and
                  Tim Salimans and
                  Jonathan Ho and
                  David J. Fleet and
                  Mohammad Norouzi},
  title        = {{Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding}},
  booktitle    = {NeurIPS},
  year         = {2022},
}

@article{ramesh-etal-2022-dalle2,
  author       = {Aditya Ramesh and
                  Prafulla Dhariwal and
                  Alex Nichol and
                  Casey Chu and
                  Mark Chen},
  title        = {{Hierarchical Text-Conditional Image Generation with {CLIP} Latents}},
  journal      = {CoRR},
  volume       = {abs/2204.06125},
  year         = {2022},
}

@inproceedings{lee-etal-2022-autoregressive,
  author       = {Doyup Lee and
                  Chiheon Kim and
                  Saehoon Kim and
                  Minsu Cho and
                  Wook{-}Shin Han},
  title        = {{Autoregressive Image Generation using Residual Quantization}},
  booktitle    = {CVPR},
  pages        = {11513--11522},
  year         = {2022},
}

@inproceedings{liao-etal-2022-text,
  author       = {Wentong Liao and
                  Kai Hu and
                  Michael Ying Yang and
                  Bodo Rosenhahn},
  title        = {{Text to Image Generation with Semantic-Spatial Aware {GAN}}},
  booktitle    = {CVPR},
  pages        = {18166--18175},
  year         = {2022},
}

@inproceedings{gafni-etal-2022-make-a-scene,
  author       = {Oran Gafni and
                  Adam Polyak and
                  Oron Ashual and
                  Shelly Sheynin and
                  Devi Parikh and
                  Yaniv Taigman},
  title        = {{Make-A-Scene: Scene-Based Text-to-Image Generation with Human Priors}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13675},
  pages        = {89--106},
  year         = {2022},
}

@inproceedings{yan-etal-2022-trace,
  author       = {Kun Yan and
                  Lei Ji and
                  Chenfei Wu and
                  Jianmin Bao and
                  Ming Zhou and
                  Nan Duan and
                  Shuai Ma},
  title        = {{Trace Controlled Text to Image Generation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13696},
  pages        = {59--75},
  year         = {2022},
}

@inproceedings{lezama-etal-2022-improved,
  author       = {Jos{\'{e}} Lezama and
                  Huiwen Chang and
                  Lu Jiang and
                  Irfan Essa},
  title        = {{Improved Masked Image Generation with Token-Critic}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13683},
  pages        = {70--86},
  year         = {2022},
}

@inproceedings{crowson-etal-2022-vqganclip,
  author       = {Katherine Crowson and
                  Stella Biderman and
                  Daniel Kornis and
                  Dashiell Stander and
                  Eric Hallahan and
                  Louis Castricato and
                  Edward Raff},
  title        = {{{VQGAN-CLIP:} Open Domain Image Generation and Editing with Natural
                  Language Guidance}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13697},
  pages        = {88--105},
  year         = {2022},
}

@inproceedings{dinh-etal-2022-tise,
  author       = {Tan M. Dinh and
                  Rang Nguyen and
                  Binh{-}Son Hua},
  title        = {{{TISE:} Bag of Metrics for Text-to-Image Synthesis Evaluation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13696},
  pages        = {594--609},
  year         = {2022},
}

@inproceedings{maharana-etal-2022-storydalle,
  author       = {Adyasha Maharana and
                  Darryl Hannan and
                  Mohit Bansal},
  title        = {{StoryDALL-E: Adapting Pretrained Text-to-Image Transformers for Story
                  Continuation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13697},
  pages        = {70--87},
  year         = {2022},
}

@inproceedings{wu-etal-2022-admagan,
  author       = {Xintian Wu and
                  Hanbin Zhao and
                  Liangli Zheng and
                  Shouhong Ding and
                  Xi Li},
  title        = {{Adma-GAN: Attribute-Driven Memory Augmented GANs for Text-to-Image
                  Generation}},
  booktitle    = {ACM MM},
  pages        = {1593--1602},
  publisher    = {{ACM}},
  year         = {2022},
}

@inproceedings{chen-etal-2022-background,
  author       = {Zhuowei Chen and
                  Zhendong Mao and
                  Shancheng Fang and
                  Bo Hu},
  title        = {{Background Layout Generation and Object Knowledge Transfer for Text-to-Image
                  Generation}},
  booktitle    = {ACM MM},
  pages        = {4327--4335},
  year         = {2022},
}

@inproceedings{huang-etal-2022-dsegan,
  author       = {Mengqi Huang and
                  Zhendong Mao and
                  Penghui Wang and
                  Quan Wang and
                  Yongdong Zhang},
  title        = {{{DSE-GAN:} Dynamic Semantic Evolution Generative Adversarial Network
                  for Text-to-Image Generation}},
  booktitle    = {ACM MM},
  pages        = {4345--4354},
  year         = {2022},
}

@inproceedings{shi-etal-2022-athom,
  author       = {Zhenbo Shi and
                  Zhi Chen and
                  Zhenbo Xu and
                  Wei Yang and
                  Liusheng Huang},
  title        = {{AtHom: Two Divergent Attentions Stimulated By Homomorphic Training
                  in Text-to-Image Synthesis}},
  booktitle    = {ACM MM},
  pages        = {2211--2219},
  year         = {2022},
}

@inproceedings{xu2023imagereward,
  author       = {Jiazheng Xu and
                  Xiao Liu and
                  Yuchen Wu and
                  Yuxuan Tong and
                  Qinkai Li and
                  Ming Ding and
                  Jie Tang and
                  Yuxiao Dong},
  title        = {{ImageReward: Learning and Evaluating Human Preferences for Text-to-Image Generation}},
  booktitle    = {NeurIPS},
  year         = {2023},
}

% DALL-E 3 technical report released by OpenAI; it has no proceedings volume,
% so it is recorded as a report. Author names were restored from a garbled
% copy that contained footnote daggers and glued given/family names.
@techreport{betker-etal-2023-dalle3,
  author       = {James Betker and
                  Gabriel Goh and
                  Li Jing and
                  Tim Brooks and
                  Jianfeng Wang and
                  Linjie Li and
                  Long Ouyang and
                  Juntang Zhuang and
                  Joyce Lee and
                  Yufei Guo and
                  Wesam Manassra and
                  Prafulla Dhariwal and
                  Casey Chu and
                  Yunxin Jiao and
                  Aditya Ramesh},
  title        = {{Improving Image Generation with Better Captions}},
  institution  = {OpenAI},
  year         = {2023},
  url          = {https://cdn.openai.com/papers/dall-e-3.pdf},
}

@article{voynov-etal-2023-promptplus,
  author       = {Andrey Voynov and
                  Qinghao Chu and
                  Daniel Cohen{-}Or and
                  Kfir Aberman},
  title        = {{{P+:} Extended Textual Conditioning in Text-to-Image Generation}},
  journal      = {CoRR},
  volume       = {abs/2303.09522},
  year         = {2023},
}

% Preprint record of SDXL; the ICLR 2024 version is podell-etal-2024-sdxl.
@article{podell-etal-2023-sdxl,
  author       = {Dustin Podell and
                  Zion English and
                  Kyle Lacey and
                  Andreas Blattmann and
                  Tim Dockhorn and
                  Jonas M{\"{u}}ller and
                  Joe Penna and
                  Robin Rombach},
  title        = {{{SDXL:} Improving Latent Diffusion Models for High-Resolution Image
                  Synthesis}},
  journal      = {CoRR},
  volume       = {abs/2307.01952},
  year         = {2023},
}

@article{sauer-etal-2023-sdxl-turbo,
  author       = {Axel Sauer and
                  Dominik Lorenz and
                  Andreas Blattmann and
                  Robin Rombach},
  title        = {{Adversarial Diffusion Distillation}},
  journal      = {CoRR},
  volume       = {abs/2311.17042},
  year         = {2023},
}

@inproceedings{sauer-etal-2023-stylegant,
  author       = {Axel Sauer and
                  Tero Karras and
                  Samuli Laine and
                  Andreas Geiger and
                  Timo Aila},
  title        = {{StyleGAN-T: Unlocking the Power of GANs for Fast Large-Scale Text-to-Image
                  Synthesis}},
  booktitle    = {ICML},
  series       = {Proceedings of Machine Learning Research},
  volume       = {202},
  pages        = {30105--30118},
  year         = {2023},
}

@inproceedings{kang-etal-2023-gigagan,
  author       = {Minguk Kang and
                  Jun{-}Yan Zhu and
                  Richard Zhang and
                  Jaesik Park and
                  Eli Shechtman and
                  Sylvain Paris and
                  Taesung Park},
  title        = {{Scaling up GANs for Text-to-Image Synthesis}},
  booktitle    = {CVPR},
  pages        = {10124--10134},
  year         = {2023},
}

@misc{pernias-etal-2023-wuerstchen,
  author        = {Pablo Pernias and
                   Dominic Rampas and
                   Mats L. Richter and
                   Christopher J. Pal and
                   Marc Aubreville},
  title         = {{Wuerstchen: An Efficient Architecture for Large-Scale Text-to-Image Diffusion Models}},
  year          = {2023},
  eprint        = {2306.00637},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{feng-etal-2023-ernie-vilg,
  author       = {Zhida Feng and
                  Zhenyu Zhang and
                  Xintong Yu and
                  Yewei Fang and
                  Lanxin Li and
                  Xuyi Chen and
                  Yuxiang Lu and
                  Jiaxiang Liu and
                  Weichong Yin and
                  Shikun Feng and
                  Yu Sun and
                  Li Chen and
                  Hao Tian and
                  Hua Wu and
                  Haifeng Wang},
  title        = {{ERNIE-ViLG 2.0: Improving Text-to-Image Diffusion Model with Knowledge-Enhanced
                  Mixture-of-Denoising-Experts}},
  booktitle    = {CVPR},
  pages        = {10135--10145},
  year         = {2023},
}

@inproceedings{zhong-etal-2023-sur-adapter,
  author       = {Shanshan Zhong and
                  Zhongzhan Huang and
                  Wushao Wen and
                  Jinghui Qin and
                  Liang Lin},
  title        = {{SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models
                  with Large Language Models}},
  booktitle    = {ACM MM},
  pages        = {567--578},
  year         = {2023},
}

% Presented at SIGGRAPH 2023 and published in the ACM Transactions on Graphics
% journal issue, which is where the article-number page range comes from.
@article{chefer-etal-2023-attend-and-excite,
  author       = {Hila Chefer and
                  Yuval Alaluf and
                  Yael Vinker and
                  Lior Wolf and
                  Daniel Cohen{-}Or},
  title        = {{Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image
                  Diffusion Models}},
  journal      = {{ACM} Trans. Graph.},
  volume       = {42},
  number       = {4},
  pages        = {148:1--148:10},
  year         = {2023},
}

@inproceedings{chang-etal-2023-muse,
  author       = {Huiwen Chang and
                  Han Zhang and
                  Jarred Barber and
                  Aaron Maschinot and
                  Jos{\'{e}} Lezama and
                  Lu Jiang and
                  Ming{-}Hsuan Yang and
                  Kevin Patrick Murphy and
                  William T. Freeman and
                  Michael Rubinstein and
                  Yuanzhen Li and
                  Dilip Krishnan},
  title        = {{Muse: Text-To-Image Generation via Masked Generative Transformers}},
  booktitle    = {ICML},
  series       = {Proceedings of Machine Learning Research},
  volume       = {202},
  pages        = {4055--4075},
  year         = {2023},
}

@inproceedings{zhou-etal-2023-shifted,
  author       = {Yufan Zhou and
                  Bingchen Liu and
                  Yizhe Zhu and
                  Xiao Yang and
                  Changyou Chen and
                  Jinhui Xu},
  title        = {{Shifted Diffusion for Text-to-image Generation}},
  booktitle    = {CVPR},
  pages        = {10157--10166},
  year         = {2023},
}

@inproceedings{tao-etal-2023-galip,
  author       = {Ming Tao and
                  Bing{-}Kun Bao and
                  Hao Tang and
                  Changsheng Xu},
  title        = {{{GALIP:} Generative Adversarial CLIPs for Text-to-Image Synthesis}},
  booktitle    = {CVPR},
  pages        = {14214--14223},
  year         = {2023},
}

@inproceedings{lu-etal-2023-specialist-diffusion,
  author       = {Haoming Lu and
                  Hazarapet Tunanyan and
                  Kai Wang and
                  Shant Navasardyan and
                  Zhangyang Wang and
                  Humphrey Shi},
  title        = {{Specialist Diffusion: Plug-and-Play Sample-Efficient Fine-Tuning of
                  Text-to-Image Diffusion Models to Learn Any Unseen Style}},
  booktitle    = {CVPR},
  pages        = {14267--14276},
  year         = {2023},
}

@inproceedings{otani-etal-2023-toward,
  author       = {Mayu Otani and
                  Riku Togashi and
                  Yu Sawai and
                  Ryosuke Ishigami and
                  Yuta Nakashima and
                  Esa Rahtu and
                  Janne Heikkil{\"{a}} and
                  Shin'ichi Satoh},
  title        = {{Toward Verifiable and Reproducible Human Evaluation for Text-to-Image
                  Generation}},
  booktitle    = {CVPR},
  pages        = {14277--14286},
  year         = {2023},
}

@inproceedings{liu-etal-2023-riatig,
  author       = {Han Liu and
                  Yuhao Wu and
                  Shixuan Zhai and
                  Bo Yuan and
                  Ning Zhang},
  title        = {{{RIATIG:} Reliable and Imperceptible Adversarial Text-to-Image Generation
                  with Natural Prompts}},
  booktitle    = {CVPR},
  pages        = {20585--20594},
  year         = {2023},
}

@misc{kodaira-etal-2023-streamdiffusion,
  author        = {Akio Kodaira and
                   Chenfeng Xu and
                   Toshiki Hazama and
                   Takanori Yoshimoto and
                   Kohei Ohno and
                   Shogo Mitsuhori and
                   Soichi Sugano and
                   Hanying Cho and
                   Zhijian Liu and
                   Kurt Keutzer},
  title         = {{StreamDiffusion: A Pipeline-level Solution for Real-time Interactive Generation}},
  year          = {2023},
  eprint        = {2312.12491},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{chen-etal-2023-controlstyle,
  author       = {Jingwen Chen and
                  Yingwei Pan and
                  Ting Yao and
                  Tao Mei},
  title        = {{ControlStyle: Text-Driven Stylized Image Generation Using Diffusion
                  Priors}},
  booktitle    = {ACM MM},
  pages        = {7540--7548},
  year         = {2023},
}

@inproceedings{xue-etal-2023-raphael,
  author       = {Zeyue Xue and
                  Guanglu Song and
                  Qiushan Guo and
                  Boxiao Liu and
                  Zhuofan Zong and
                  Yu Liu and
                  Ping Luo},
  title        = {{{RAPHAEL:} Text-to-Image Generation via Large Mixture of Diffusion
                  Paths}},
  booktitle    = {NeurIPS},
  year         = {2023},
}

@misc{yuan-etal-2024-selfplay,
  author        = {Huizhuo Yuan and
                   Zixiang Chen and
                   Kaixuan Ji and
                   Quanquan Gu},
  title         = {{Self-Play Fine-Tuning of Diffusion Models for Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2402.10210},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

@misc{yang-etal-2024-rpg,
  author        = {Ling Yang and
                   Zhaochen Yu and
                   Chenlin Meng and
                   Minkai Xu and
                   Stefano Ermon and
                   Bin Cui},
  title         = {{Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs}},
  year          = {2024},
  eprint        = {2401.11708},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2024-playground,
  author        = {Daiqing Li and
                   Aleks Kamko and
                   Ehsan Akhgari and
                   Ali Sabet and
                   Linmiao Xu and
                   Suhail Doshi},
  title         = {{Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2402.17245},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2024-distrifusion,
  author        = {Muyang Li and
                   Tianle Cai and
                   Jiaxin Cao and
                   Qinsheng Zhang and
                   Han Cai and
                   Junjie Bai and
                   Yangqing Jia and
                   Ming-Yu Liu and
                   Kai Li and
                   Song Han},
  title         = {{DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models}},
  year          = {2024},
  eprint        = {2402.19481},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-instancediffusion,
  author        = {Xudong Wang and
                   Trevor Darrell and
                   Sai Saketh Rambhatla and
                   Rohit Girdhar and
                   Ishan Misra},
  title         = {{InstanceDiffusion: Instance-level Control for Image Generation}},
  year          = {2024},
  eprint        = {2402.03290},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{patel-etal-2023-eclipse,
  author        = {Maitreya Patel and
                   Changhoon Kim and
                   Sheng Cheng and
                   Chitta Baral and
                   Yezhou Yang},
  title         = {{ECLIPSE: A Resource-Efficient Text-to-Image Prior for Image Generations}},
  year          = {2023},
  eprint        = {2312.04655},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hu-etal-2024-instructimagen,
  author        = {Hexiang Hu and
                   Kelvin C. K. Chan and
                   Yu-Chuan Su and
                   Wenhu Chen and
                   Yandong Li and
                   Kihyuk Sohn and
                   Yang Zhao and
                   Xue Ben and
                   Boqing Gong and
                   William Cohen and
                   Ming-Wei Chang and
                   Xuhui Jia},
  title         = {{Instruct-Imagen: Image Generation with Multi-modal Instruction}},
  year          = {2024},
  eprint        = {2401.01952},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% arXiv preprint. The venue is stored in the journal field because article
% entries do not use booktitle; CoRR matches the other arXiv-only articles in
% this file. TODO: add the arXiv identifier as the volume (abs/NNNN.NNNNN).
@article{cheng-etal-2023-learning,
  author       = {Cheng, Ta-Ying and
                  Gadelha, Matheus and
                  Groueix, Thibault and
                  Fisher, Matthew and
                  Mech, Radomir and
                  Markham, Andrew and
                  Trigoni, Niki},
  title        = {{Learning Continuous 3D Words for Text-to-Image Generation}},
  journal      = {CoRR},
  year         = {2024},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{narasimhaswamy-etal-2024-handiffuser,
  author        = {Supreeth Narasimhaswamy and
                   Uttaran Bhattacharya and
                   Xiang Chen and
                   Ishita Dasgupta and
                   Saayan Mitra and
                   Minh Hoai},
  title         = {{HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances}},
  year          = {2024},
  eprint        = {2403.01693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liang-etal-2023-rich,
  author        = {Youwei Liang and
                   Junfeng He and
                   Gang Li and
                   Peizhao Li and
                   Arseniy Klimovskiy and
                   Nicholas Carolan and
                   Jiao Sun and
                   Jordi Pont-Tuset and
                   Sarah Young and
                   Feng Yang and
                   Junjie Ke and
                   Krishnamurthy Dj Dvijotham and
                   Katie Collins and
                   Yiwen Luo and
                   Yang Li and
                   Kai J Kohlhoff and
                   Deepak Ramachandran and
                   Vidhya Navalpakkam},
  title         = {{Rich Human Feedback for Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2312.10240},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{jayasumana-etal-2023-markovgen,
  author        = {Sadeep Jayasumana and
                   Daniel Glasner and
                   Srikumar Ramalingam and
                   Andreas Veit and
                   Ayan Chakrabarti and
                   Sanjiv Kumar},
  title         = {{MarkovGen: Structured Prediction for Efficient Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2308.10997},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhou-etal-2023-customization,
  author        = {Yufan Zhou and
                   Ruiyi Zhang and
                   Jiuxiang Gu and
                   Tong Sun},
  title         = {{Customization Assistant for Text-to-image Generation}},
  year          = {2023},
  eprint        = {2312.03045},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-stylegan,
  author        = {Xiaoming Li and
                   Xinyu Hou and
                   Chen Change Loy},
  title         = {{When StyleGAN Meets Stable Diffusion: a $\mathscr{W}_+$ Adapter for Personalized Image Generation}},
  year          = {2023},
  eprint        = {2311.17461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2023-learning,
  author        = {Siteng Huang and
                   Biao Gong and
                   Yutong Feng and
                   Xi Chen and
                   Yuqian Fu and
                   Yu Liu and
                   Donglin Wang},
  title         = {{Learning Disentangled Identifiers for Action-Customized Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2311.15841},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{xu-etal-2023-ufogen,
  author        = {Yanwu Xu and
                   Yang Zhao and
                   Zhisheng Xiao and
                   Tingbo Hou},
  title         = {{UFOGen: You Forward Once Large Scale Text-to-Image Generation via Diffusion GANs}},
  year          = {2023},
  eprint        = {2311.09257},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-selfdiscovering,
  author        = {Hang Li and
                   Chengzhi Shen and
                   Philip Torr and
                   Volker Tresp and
                   Jindong Gu},
  title         = {{Self-Discovering Interpretable Diffusion Latent Directions for Responsible Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2311.17216},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{mei-etal-2024-codi,
  author        = {Kangfu Mei and
                   Mauricio Delbracio and
                   Hossein Talebi and
                   Zhengzhong Tu and
                   Vishal M. Patel and
                   Peyman Milanfar},
  title         = {{CoDi: Conditional Diffusion Distillation for Higher-Fidelity and Faster Image Generation}},
  year          = {2024},
  eprint        = {2310.01407},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{ding-etal-2024-patched,
  author       = {Zheng Ding and
                  Mengqi Zhang and
                  Jiajun Wu and
                  Zhuowen Tu},
  title        = {{Patched Denoising Diffusion Models For High-Resolution Image Synthesis}},
  booktitle    = {ICLR},
  pages        = {1--18},
  year         = {2024},
}

% The title is case-protected like every other entry in this file, so that
% sentence-casing styles do not lowercase the method name "Relay Diffusion".
@inproceedings{teng-etal-2024-relaydiffusion,
  author       = {Jiayan Teng and
                  Wendi Zheng and
                  Ming Ding and
                  Wenyi Hong and
                  Jianqiao Wangni and
                  Zhuoyi Yang and
                  Jie Tang},
  title        = {{Relay Diffusion: Unifying diffusion process across resolutions for image synthesis}},
  booktitle    = {ICLR},
  pages        = {1--18},
  year         = {2024},
}

@inproceedings{podell-etal-2024-sdxl,
  author    = {{Dustin Podell and Zion English and Kyle Lacey and Andreas Blattmann and Tim Dockhorn and Jonas M{\"u}ller and Joe Penna and Robin Rombach}},
  title     = {{{SDXL}: Improving Latent Diffusion Models for High-Resolution Image Synthesis}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--13},
}

@misc{lee-etal-2024-composeandconquer,
  author        = {{Jonghyun Lee and Hansam Cho and Youngjoon Yoo and Seoung Bum Kim and Yonghyun Jeong}},
  title         = {{Compose and Conquer: Diffusion-Based 3D Depth Aware Composable Image Synthesis}},
  year          = {2024},
  eprint        = {2401.09048},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{tan-etal-2023-semanticaware,
  author        = {{Zhaorui Tan and Xi Yang and Kaizhu Huang}},
  title         = {{Semantic-aware Data Augmentation for Text-to-image Synthesis}},
  year          = {2023},
  eprint        = {2312.07951},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{cheng-etal-2024-resadapter,
  author        = {{Jiaxiang Cheng and Pan Xie and Xin Xia and Jiashi Li and Jie Wu and Yuxi Ren and Huixia Li and Xuefeng Xiao and Min Zheng and Lean Fu}},
  title         = {{ResAdapter: Domain Consistent Resolution Adapter for Diffusion Models}},
  year          = {2024},
  eprint        = {2403.02084},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{fei-etal-2024-dis,
  author        = {{Zhengcong Fei and Mingyuan Fan and Changqian Yu and Junshi Huang}},
  title         = {{Scalable Diffusion Models with State Space Backbone}},
  year          = {2024},
  eprint        = {2402.05608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{wang-etal-2024-instantid,
  author        = {{Qixun Wang and Xu Bai and Haofan Wang and Zekui Qin and Anthony Chen and Huaxia Li and Xu Tang and Yao Hu}},
  title         = {{InstantID: Zero-shot Identity-Preserving Generation in Seconds}},
  year          = {2024},
  eprint        = {2401.07519},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% The Greek letter is a math-mode symbol, so it is wrapped in $...$ and braced
% to keep it as one protected unit inside the title.
@misc{chen-etal-2024-pixartdelta,
  author        = {{Junsong Chen and Yue Wu and Simian Luo and Enze Xie and Sayak Paul and Ping Luo and Hang Zhao and Zhenguo Li}},
  title         = {{PIXART-{$\delta$}: Fast and Controllable Image Generation with Latent Consistency Models}},
  year          = {2024},
  eprint        = {2401.05252},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% The Greek letter is a math-mode symbol, so it is wrapped in $...$ and braced
% to keep it as one protected unit inside the title.
@inproceedings{chen-etal-2024-pixartalpha,
  author    = {{Junsong Chen and Jincheng Yu and Chongjian Ge and Lewei Yao and Enze Xie and Zhongdao Wang and James Kwok and Ping Luo and Huchuan Lu and Zhenguo Li}},
  title     = {{PixArt-{$\alpha$}: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--31},
}

% The Greek letter is a math-mode symbol, so it is wrapped in $...$ and braced
% to keep it as one protected unit inside the title.
@misc{chen-etal-2024-pixartsigma,
  author        = {{Junsong Chen and Chongjian Ge and Enze Xie and Yue Wu and Lewei Yao and Xiaozhe Ren and Zhongdao Wang and Ping Luo and Huchuan Lu and Zhenguo Li}},
  title         = {{PixArt-{$\Sigma$}: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2403.04692},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zheng-etal-2024-cogview3,
  author        = {{Wendi Zheng and Jiayan Teng and Zhuoyi Yang and Weihan Wang and Jidong Chen and Xiaotao Gu and Yuxiao Dong and Ming Ding and Jie Tang}},
  title         = {{CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion}},
  year          = {2024},
  eprint        = {2403.05121},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{hu-etal-2024-ella,
  author        = {{Xiwei Hu and Rui Wang and Yixiao Fang and Bin Fu and Pei Cheng and Gang Yu}},
  title         = {{ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment}},
  year          = {2024},
  eprint        = {2403.05135},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{su-etal-2024-text2street,
  author        = {{Jinming Su and Songen Gu and Yiting Duan and Xingyue Chen and Junfeng Luo}},
  title         = {{Text2Street: Controllable Text-to-image Generation for Street Views}},
  year          = {2024},
  eprint        = {2402.04504},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------CONDITIONAL TEXT-TO-IMAGE GENERATION---------------------

% arXiv preprint, recorded in the same CoRR form as the file's other preprints.
% NOTE(review): the key says 2023 while the year field is 2022; the key is
% kept unchanged so existing citations continue to resolve -- confirm the year.
@article{wang-etal-2023-piti,
  author       = {{Tengfei Wang and
                  Ting Zhang and
                  Bo Zhang and
                  Hao Ouyang and
                  Dong Chen and
                  Qifeng Chen and
                  Fang Wen}},
  title        = {{Pretraining is All You Need for Image-to-Image Translation}},
  journal      = {CoRR},
  volume       = {abs/2205.12952},
  year         = {2022},
}

@inproceedings{li-etal-2023-gligen,
  title     = {{{GLIGEN:} Open-Set Grounded Text-to-Image Generation}},
  author    = {{Yuheng Li and Haotian Liu and Qingyang Wu and Fangzhou Mu and Jianwei Yang and Jianfeng Gao and Chunyuan Li and Yong Jae Lee}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {22511--22521},
}

@inproceedings{zhang-etal-2023-controlnet,
  title     = {{Adding Conditional Control to Text-to-Image Diffusion Models}},
  author    = {{Lvmin Zhang and Anyi Rao and Maneesh Agrawala}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {3813--3824},
}

% arXiv preprint; the volume carries the arXiv identifier, matching the other
% CoRR entries in this file.
@article{mou-etal-2023-t2i-adapter,
  author       = {{Chong Mou and
                  Xintao Wang and
                  Liangbin Xie and
                  Jian Zhang and
                  Zhongang Qi and
                  Ying Shan and
                  Xiaohu Qie}},
  title        = {{T2I-Adapter: Learning Adapters to Dig out More Controllable Ability
                  for Text-to-Image Diffusion Models}},
  journal      = {CoRR},
  volume       = {abs/2302.08453},
  year         = {2023},
}

@inproceedings{huang-etal-2023-composer,
  title     = {{Composer: Creative and Controllable Image Synthesis with Composable Conditions}},
  author    = {{Lianghua Huang and Di Chen and Yu Liu and Yujun Shen and Deli Zhao and Jingren Zhou}},
  booktitle = {ICML},
  year      = {2023},
  pages     = {13753--13773},
}

@inproceedings{voynov-etal-2023-sketch,
  title     = {{Sketch-Guided Text-to-Image Diffusion Models}},
  author    = {{Andrey Voynov and Kfir Aberman and Daniel Cohen{-}Or}},
  booktitle = {SIGGRAPH},
  year      = {2023},
  pages     = {55:1--55:11},
}

@inproceedings{bartal-etal-2023-multidiffusion,
  author       = {{Omer Bar{-}Tal and
                  Lior Yariv and
                  Yaron Lipman and
                  Tali Dekel}},
  title        = {{MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation}},
  booktitle    = {ICML},
  pages        = {1737--1752},
  year         = {2023},
}

@inproceedings{avrahami-etal-2023-spatext,
  title     = {{SpaText: Spatio-Textual Representation for Controllable Image Generation}},
  author    = {{Omri Avrahami and Thomas Hayes and Oran Gafni and Sonal Gupta and Yaniv Taigman and Devi Parikh and Dani Lischinski and Ohad Fried and Xi Yin}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {18370--18380},
}

@misc{zhao-etal-2023-unicontrolnet,
  author        = {{Shihao Zhao and Dongdong Chen and Yen-Chun Chen and Jianmin Bao and Shaozhe Hao and Lu Yuan and Kwan-Yee K. Wong}},
  title         = {{Uni-ControlNet: All-in-One Control to Text-to-Image Diffusion Models}},
  year          = {2023},
  eprint        = {2305.16322},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{wang-etal-2023-incontext,
  author    = {{Zhendong Wang and Yifan Jiang and Yadong Lu and Yelong Shen and Pengcheng He and Weizhu Chen and Zhangyang Wang and Mingyuan Zhou}},
  title     = {{In-Context Learning Unlocked for Diffusion Models}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@inproceedings{liu-etal-2023-more,
  title     = {{More Control for Free! Image Synthesis with Semantic Diffusion Guidance}},
  author    = {{Xihui Liu and Dong Huk Park and Samaneh Azadi and Gong Zhang and Arman Chopikyan and Yuxiao Hu and Humphrey Shi and Anna Rohrbach and Trevor Darrell}},
  booktitle = {WACV},
  year      = {2023},
  pages     = {289--299},
}

@inproceedings{yang-etal-2023-reco,
  title     = {{ReCo: Region-Controlled Text-to-Image Generation}},
  author    = {{Zhengyuan Yang and Jianfeng Wang and Zhe Gan and Linjie Li and Kevin Lin and Chenfei Wu and Nan Duan and Zicheng Liu and Ce Liu and Michael Zeng and Lijuan Wang}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {14246--14255},
}

@inproceedings{farshad-etal-2023-scenegenie,
  title     = {{SceneGenie: Scene Graph Guided Diffusion Models for Image Synthesis}},
  author    = {{Azade Farshad and Yousef Yeganeh and Yu Chi and Chengzhi Shen and Bj{\"{o}}rn Ommer and Nassir Navab}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {88--98},
}

@misc{li-etal-2023-blipdiffusion,
  author        = {{Dongxu Li and Junnan Li and Steven C. H. Hoi}},
  title         = {{BLIP-Diffusion: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing}},
  year          = {2023},
  eprint        = {2305.14720},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{qu-etal-2023-layoutllm-t2i,
  title     = {{LayoutLLM-T2I: Eliciting Layout Guidance from {LLM} for Text-to-Image Generation}},
  author    = {{Leigang Qu and Shengqiong Wu and Hao Fei and Liqiang Nie and Tat{-}Seng Chua}},
  booktitle = {ACM MM},
  year      = {2023},
  pages     = {643--654},
}

@inproceedings{feng-etal-2023-training,
  title     = {{Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis}},
  author    = {{Weixi Feng and Xuehai He and Tsu{-}Jui Fu and Varun Jampani and Arjun R. Akula and Pradyumna Narayana and Sugato Basu and Xin Eric Wang and William Yang Wang}},
  booktitle = {ICLR},
  year      = {2023},
}

@article{chen-etal-2024-training,
  title   = {{Training-Free Layout Control with Cross-Attention Guidance}},
  author  = {{Minghao Chen and Iro Laina and Andrea Vedaldi}},
  journal = {CoRR},
  volume  = {abs/2304.03373},
  year    = {2024},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lv-etal-2024-place,
  author        = {{Zhengyao Lv and Yuxiang Wei and Wangmeng Zuo and Kwan-Yee K. Wong}},
  title         = {{PLACE: Adaptive Layout-Semantic Fusion for Semantic Image Synthesis}},
  year          = {2024},
  eprint        = {2403.01852},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{cho-etal-2024-oneshot,
  author        = {{Hansam Cho and Jonghyun Lee and Seunggyu Chang and Yonghyun Jeong}},
  title         = {{One-Shot Structure-Aware Stylized Image Synthesis}},
  year          = {2024},
  eprint        = {2402.17275},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{phung-etal-2023-grounded,
  author        = {{Quynh Phung and Songwei Ge and Jia-Bin Huang}},
  title         = {{Grounded Text-to-Image Synthesis with Attention Refocusing}},
  year          = {2023},
  eprint        = {2306.05427},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lu-etal-2024-coarsetofine,
  author        = {{Yanzuo Lu and Manlin Zhang and Andy J Ma and Xiaohua Xie and Jian-Huang Lai}},
  title         = {{Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis}},
  year          = {2024},
  eprint        = {2402.18078},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{shen-etal-2024-advancing,
  author    = {{Fei Shen and Hu Ye and Jun Zhang and Cong Wang and Xiao Han and Yang Wei}},
  title     = {{Advancing Pose-Guided Image Synthesis with Progressive Conditional Diffusion Models}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--19},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{jia-etal-2023-ssmg,
  author        = {{Chengyou Jia and Minnan Luo and Zhuohang Dang and Guang Dai and Xiaojun Chang and Mengmeng Wang and Jingdong Wang}},
  title         = {{SSMG: Spatial-Semantic Map Guided Diffusion Model for Free-form Layout-to-Image Generation}},
  year          = {2023},
  eprint        = {2308.10156},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{wang-etal-2023-compositional,
  author        = {{Ruichen Wang and Zekang Chen and Chen Chen and Jian Ma and Haonan Lu and Xiaodong Lin}},
  title         = {{Compositional Text-to-Image Synthesis with Attention Map Control of Diffusion Models}},
  year          = {2023},
  eprint        = {2305.13921},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------PERSONALIZED TEXT-TO-IMAGE GENERATION---------------------
@inproceedings{kumari-etal-2023-custom-diffusion,
  title     = {{Multi-Concept Customization of Text-to-Image Diffusion}},
  author    = {{Nupur Kumari and Bingliang Zhang and Richard Zhang and Eli Shechtman and Jun{-}Yan Zhu}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {1931--1941},
}

@inproceedings{ruiz-etal-2023-dreambooth,
  title     = {{DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation}},
  author    = {{Nataniel Ruiz and Yuanzhen Li and Varun Jampani and Yael Pritch and Michael Rubinstein and Kfir Aberman}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {22500--22510},
}

@inproceedings{wei-etal-2023-elite,
  title     = {{{ELITE:} Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation}},
  author    = {{Yuxiang Wei and Yabo Zhang and Zhilong Ji and Jinfeng Bai and Lei Zhang and Wangmeng Zuo}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {15897--15907},
}

@inproceedings{gal-etal-2023-textual-inversion,
  title     = {{An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion}},
  author    = {{Rinon Gal and Yuval Alaluf and Yuval Atzmon and Or Patashnik and Amit Haim Bermano and Gal Chechik and Daniel Cohen{-}Or}},
  booktitle = {ICLR},
  year      = {2023},
}

@inproceedings{avrahami-etal-2023-break-a-scene,
  title     = {{Break-A-Scene: Extracting Multiple Concepts from a Single Image}},
  author    = {{Omri Avrahami and Kfir Aberman and Ohad Fried and Daniel Cohen{-}Or and Dani Lischinski}},
  booktitle = {SIGGRAPH},
  year      = {2023},
  pages     = {96:1--96:12},
}

@article{shi-etal-2023-instantbooth,
  title   = {{InstantBooth: Personalized Text-to-Image Generation without Test-Time Finetuning}},
  author  = {{Jing Shi and Wei Xiong and Zhe Lin and Hyun Joon Jung}},
  journal = {CoRR},
  volume  = {abs/2304.03411},
  year    = {2023},
}

@article{gal-etal-2023-encoder,
  title   = {{Encoder-based Domain Tuning for Fast Personalization of Text-to-Image Models}},
  author  = {{Rinon Gal and Moab Arar and Yuval Atzmon and Amit H. Bermano and Gal Chechik and Daniel Cohen{-}Or}},
  journal = {{ACM} Trans. Graph.},
  year    = {2023},
  volume  = {42},
  number  = {4},
  pages   = {150:1--150:13},
}

@misc{li-etal-2023-photomaker,
  author        = {{Zhen Li and Mingdeng Cao and Xintao Wang and Zhongang Qi and Ming-Ming Cheng and Ying Shan}},
  title         = {{PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding}},
  year          = {2023},
  eprint        = {2312.04461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{pang-etal-2023-cross,
  author        = {{Lianyu Pang and Jian Yin and Haoran Xie and Qiping Wang and Qing Li and Xudong Mao}},
  title         = {{Cross Initialization for Personalized Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2312.15905},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hertz-etal-2024-style,
  author        = {{Amir Hertz and Andrey Voynov and Shlomi Fruchter and Daniel Cohen-Or}},
  title         = {{Style Aligned Image Generation via Shared Attention}},
  year          = {2024},
  eprint        = {2312.02133},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-highfidelity,
  author        = {{Yibin Wang and Weizhong Zhang and Jianwei Zheng and Cheng Jin}},
  title         = {{High-fidelity Person-centric Subject-to-Image Synthesis}},
  year          = {2024},
  eprint        = {2311.10329},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{cai-etal-2023-decoupled,
  author        = {{Yufei Cai and Yuxiang Wei and Zhilong Ji and Jinfeng Bai and Hu Han and Wangmeng Zuo}},
  title         = {{Decoupled Textual Embeddings for Customized Image Generation}},
  year          = {2023},
  eprint        = {2312.11826},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2024-realcustom,
  author        = {{Mengqi Huang and Zhendong Mao and Mingcong Liu and Qian He and Yongdong Zhang}},
  title         = {{RealCustom: Narrowing Real Text Word for Real-Time Open-Domain Text-to-Image Customization}},
  year          = {2024},
  eprint        = {2403.00483},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------TEXT-GUIDED IMAGE EDITING---------------------

@inproceedings{cao-etal-2023-masactrl,
  title     = {{MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing}},
  author    = {{Mingdeng Cao and Xintao Wang and Zhongang Qi and Ying Shan and Xiaohu Qie and Yinqiang Zheng}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {22503--22513},
}

@inproceedings{patashnik-etal-2023-localizing,
  title     = {{Localizing Object-level Shape Variations with Text-to-Image Diffusion Models}},
  author    = {{Or Patashnik and Daniel Garibi and Idan Azuri and Hadar Averbuch{-}Elor and Daniel Cohen{-}Or}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {22994--23004},
}

@inproceedings{wu-etal-2023-uncovering,
  title     = {{Uncovering the Disentanglement Capability in Text-to-Image Diffusion Models}},
  author    = {{Qiucheng Wu and Yujian Liu and Handong Zhao and Ajinkya Kale and Trung Bui and Tong Yu and Zhe Lin and Yang Zhang and Shiyu Chang}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {1900--1910},
}

@inproceedings{zhang-etal-2023-sine,
  title     = {{{SINE:} SINgle Image Editing with Text-to-Image Diffusion Models}},
  author    = {{Zhixing Zhang and Ligong Han and Arnab Ghosh and Dimitris N. Metaxas and Jian Ren}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {6027--6037},
}

@inproceedings{kawar-etal-2023-imagic,
  title     = {{Imagic: Text-Based Real Image Editing with Diffusion Models}},
  author    = {{Bahjat Kawar and Shiran Zada and Oran Lang and Omer Tov and Huiwen Chang and Tali Dekel and Inbar Mosseri and Michal Irani}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {6007--6017},
}

@inproceedings{brooks-etal-2023-instructpix2pix,
  title     = {{InstructPix2Pix: Learning to Follow Image Editing Instructions}},
  author    = {{Tim Brooks and Aleksander Holynski and Alexei A. Efros}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {18392--18402},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{xu-etal-2023-infedit,
  author        = {{Sihan Xu and Yidong Huang and Jiayi Pan and Ziqiao Ma and Joyce Chai}},
  title         = {{Inversion-Free Image Editing with Natural Language}},
  year          = {2023},
  eprint        = {2312.04965},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2024-understanding,
  author        = {{Bingyan Liu and Chengyu Wang and Tingfeng Cao and Kui Jia and Jun Huang}},
  title         = {{Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing}},
  year          = {2024},
  eprint        = {2403.03431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{song-etal-2024-doubly,
  author        = {{Xue Song and Jiequan Cui and Hanwang Zhang and Jingjing Chen and Richang Hong and Yu-Gang Jiang}},
  title         = {{Doubly Abductive Counterfactual Inference for Text-based Image Editing}},
  year          = {2024},
  eprint        = {2403.02981},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{guo-etal-2023-focus,
  author        = {{Qin Guo and Tianwei Lin}},
  title         = {{Focus on Your Instruction: Fine-grained and Multi-instruction Image Editing by Attention Modulation}},
  year          = {2023},
  eprint        = {2312.10113},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{nam-etal-2023-contrastive,
  author        = {{Hyelin Nam and Gihyun Kwon and Geon Yeong Park and Jong Chul Ye}},
  title         = {{Contrastive Denoising Score for Text-guided Latent Diffusion Image Editing}},
  year          = {2023},
  eprint        = {2311.18608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{shi-etal-2023-dragdiffusion,
  author        = {{Yujun Shi and Chuhui Xue and Jun Hao Liew and Jiachun Pan and Hanshu Yan and Wenqing Zhang and Vincent Y. F. Tan and Song Bai}},
  title         = {{DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing}},
  year          = {2023},
  eprint        = {2306.14435},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{mou-etal-2024-diffeditor,
  author        = {{Chong Mou and Xintao Wang and Jiechong Song and Ying Shan and Jian Zhang}},
  title         = {{DiffEditor: Boosting Accuracy and Flexibility on Diffusion-based Image Editing}},
  year          = {2024},
  eprint        = {2402.02583},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ling-etal-2023-freedrag,
  author        = {{Pengyang Ling and Lin Chen and Pan Zhang and Huaian Chen and Yi Jin and Jinjin Zheng}},
  title         = {{FreeDrag: Feature Dragging for Reliable Point-based Image Editing}},
  year          = {2023},
  eprint        = {2307.04684},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lin-etal-2023-textdriven,
  author        = {{Yuanze Lin and Yi-Wen Chen and Yi-Hsuan Tsai and Lu Jiang and Ming-Hsuan Yang}},
  title         = {{Text-Driven Image Editing via Learnable Regions}},
  year          = {2023},
  eprint        = {2311.16432},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% NOTE(review): this key omits the "-etal-" segment that the other
% multi-author keys in this file carry; it is left unchanged so that existing
% citations keep resolving.
@misc{brack-2023-leditsplusplus,
  author        = {{Manuel Brack and Felix Friedrich and Katharina Kornmeier and Linoy Tsaban and Patrick Schramowski and Kristian Kersting and Apolinário Passos}},
  title         = {{LEDITS++: Limitless Image Editing using Text-to-Image Models}},
  year          = {2023},
  eprint        = {2311.16711},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2023-smartedit,
  author        = {{Yuzhou Huang and Liangbin Xie and Xintao Wang and Ziyang Yuan and Xiaodong Cun and Yixiao Ge and Jiantao Zhou and Chao Dong and Rui Huang and Ruimao Zhang and Ying Shan}},
  title         = {{SmartEdit: Exploring Complex Instruction-based Image Editing with Multimodal Large Language Models}},
  year          = {2023},
  eprint        = {2312.06739},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{nguyen-etal-2024-edit,
  author        = {{Thao Nguyen and Utkarsh Ojha and Yuheng Li and Haotian Liu and Yong Jae Lee}},
  title         = {{Edit One for All: Interactive Batch Image Editing}},
  year          = {2024},
  eprint        = {2401.10219},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE(review): this key contains '=' where every sibling key uses '-'. It is
% left unchanged so that existing citations keep resolving; rename it together
% with the citing documents if the '=' is a typo.
@inproceedings{fu-etal-2024=guiding,
  author    = {{Tsu-Jui Fu and Wenze Hu and Xianzhi Du and William Yang Wang and Yinfei Yang and Zhe Gan}},
  title     = {{Guiding Instruction-based Image Editing via Multimodal Large Language Models}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--24},
}

% NOTE: This reference will be replaced with ICLR version as soon as the metadata is updated
@misc{nie-etal-2024-blessing,
  author        = {{Shen Nie and Hanzhong Allan Guo and Cheng Lu and Yuhao Zhou and Chenyu Zheng and Chongxuan Li}},
  title         = {{The Blessing of Randomness: SDE Beats ODE in General Diffusion-based Image Editing}},
  year          = {2024},
  eprint        = {2311.01410},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{geng-etal-2024-motionguidance,
  author    = {{Daniel Geng and Andrew Owens}},
  title     = {{Motion Guidance: Diffusion-Based Image Editing with Differentiable Motion Estimators}},
  booktitle = {ICLR},
  year      = {2024},
}

@inproceedings{yang-etal-2024-objectaware,
  author    = {{Zhen Yang and Ganggui Ding and Wen Wang and Hao Chen and Bohan Zhuang and Chunhua Shen}},
  title     = {{Object-Aware Inversion and Reassembly for Image Editing}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--20},
}

@inproceedings{cho-etal-2024-noisemapguidance,
  author    = {{Hansam Cho and Jonghyun Lee and Seoung Bum Kim and Tae-Hyun Oh and Yonghyun Jeong}},
  title     = {{Noise Map Guidance: Inversion with Spatial Context for Real Image Editing}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--20},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{duan-etal-2023-tuningfree,
  author        = {{Xiaoyue Duan and Shuhao Cui and Guoliang Kang and Baochang Zhang and Zhengcong Fei and Mingyuan Fan and Junshi Huang}},
  title         = {{Tuning-Free Inversion-Enhanced Control for Consistent Image Editing}},
  year          = {2023},
  eprint        = {2312.14611},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{qiao-etal-2023-baret,
  author        = {{Yuming Qiao and Fanyi Wang and Jingwen Su and Yanhao Zhang and Yunjie Yu and Siyu Wu and Guo-Jun Qi}},
  title         = {{BARET: Balanced Attention based Real image Editing driven by Target-text Inversion}},
  year          = {2023},
  eprint        = {2312.05482},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{yu-etal-2024-accelerating,
  author        = {{Zihao Yu and Haoyang Li and Fangcheng Fu and Xupeng Miao and Bin Cui}},
  title         = {{Accelerating Text-to-Image Editing via Cache-Enabled Sparse Diffusion Inference}},
  year          = {2024},
  eprint        = {2305.17423},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{hou-etal-2024-highfidelity,
  author        = {{Chen Hou and Guoqiang Wei and Zhibo Chen}},
  title         = {{High-Fidelity Diffusion-based Image Editing}},
  year          = {2024},
  eprint        = {2312.15707},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{ma-etal-2023-adapedit,
  author        = {{Zhiyuan Ma and Guoli Jia and Bowen Zhou}},
  title         = {{AdapEdit: Spatio-Temporal Guided Adaptive Editing Algorithm for Text-Based Continuity-Sensitive Image Editing}},
  year          = {2023},
  eprint        = {2312.08019},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{feng-etal-2024-item,
  author        = {{Aosong Feng and Weikang Qiu and Jinbin Bai and Kaicheng Zhou and Zhen Dong and Xiao Zhang and Rex Ying and Leandros Tassiulas}},
  title         = {{An Item is Worth a Prompt: Versatile Image Editing with Disentangled Control}},
  year          = {2024},
  eprint        = {2403.04880},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{cui-etal-2024-stabledrag,
  author        = {{Yutao Cui and Xiaotong Zhao and Guozhen Zhang and Shengming Cao and Kai Ma and Limin Wang}},
  title         = {{StableDrag: Stable Dragging for Point-based Image Editing}},
  year          = {2024},
  eprint        = {2403.04437},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% arXiv preprint 2312.16145 (cs.CV).
% NOTE(review): this key has no hyphen between "etal" and the year, unlike the
% neighbouring keys; it is kept unchanged because documents may already cite it.
@misc{lyu-etal2024-onedimensional,
  author        = {{Mengyao Lyu and Yuhong Yang and Haiwen Hong and Hui Chen and Xuan Jin and Yuan He and Hui Xue and Jungong Han and Guiguang Ding}},
  title         = {{One-Dimensional Adapter to Rule Them All: Concepts, Diffusion Models and Erasing Applications}},
  year          = {2024},
  eprint        = {2312.16145},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------TEXT IMAGE GENERATION---------------------

% NOTE: This reference will be replaced with the CVPR version as soon as the metadata is updated
% arXiv preprint 2311.03054 (cs.CV).
@misc{tuo-etal-2024-anytext,
  author        = {{Yuxiang Tuo and Wangmeng Xiang and Jun-Yan He and Yifeng Geng and Xuansong Xie}},
  title         = {{AnyText: Multilingual Visual Text Generation And Editing}},
  year          = {2024},
  eprint        = {2311.03054},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------DATASETS---------------------

% Conference paper; venue recorded under its short name in booktitle.
@inproceedings{lin-etal-2014-mscoco,
  author    = {{Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and James Hays and
                Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick}},
  title     = {{Microsoft {COCO:} Common Objects in Context}},
  booktitle = {ECCV},
  volume    = {8693},
  pages     = {740--755},
  year      = {2014},
}

% Conference paper; venue recorded under its short name in booktitle.
@inproceedings{sharma-etal-2018-conceptual-captions,
  author    = {{Piyush Sharma and Nan Ding and Sebastian Goodman and Radu Soricut}},
  title     = {{Conceptual Captions: {A} Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning}},
  booktitle = {ACL},
  pages     = {2556--2565},
  year      = {2018},
}

% Conference paper; venue recorded under its short name in booktitle.
@inproceedings{schuhmann-etal-2022-laion,
  author    = {{Christoph Schuhmann and Romain Beaumont and Richard Vencu and Cade Gordon and
                Ross Wightman and Mehdi Cherti and Theo Coombes and Aarush Katta and
                Clayton Mullis and Mitchell Wortsman and Patrick Schramowski and
                Srivatsa Kundurthy and Katherine Crowson and Ludwig Schmidt and
                Robert Kaczmarczyk and Jenia Jitsev}},
  title     = {{{LAION-5B:} An Open Large-Scale Dataset for Training Next Generation Image-Text Models}},
  booktitle = {NeurIPS},
  year      = {2022},
}
